{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4f991098-08b9-4e8b-912e-ce79879908b9",
   "metadata": {},
   "source": [
    "### Structural variant scores, constraint and conservation \n",
    "\n",
    "Scores:\n",
    "* CADD-SV \n",
    "* PhyloP maximum\n",
    "* gnomAD constraint z-score max\n",
    "* gwRVIS minimum\n",
    "* HARs overlap\n",
    "\n",
    "--- \n",
    "\n",
    "* Logistic regression SV length adjusted and non SV length adjusted. \n",
    "* Model: misexpression-associated(1)/control(0) ~ feature (+ SV length)\n",
    "* Test duplications and deletions separately "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "80713c51-2a86-46cb-8c73-8ff8d022654a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd \n",
    "import numpy as np\n",
    "from patsy import dmatrices\n",
    "from pathlib import Path\n",
    "from statsmodels.discrete import discrete_model\n",
    "from pybedtools import BedTool\n",
    "from io import StringIO\n",
    "from functools import reduce\n",
    "from statsmodels.stats import multitest"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "c8adb49c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# inputs \n",
    "wkdir = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/misexpression_v3/\"\n",
    "wkdir_path = Path(wkdir)\n",
    "\n",
    "# control and misexpressed SVs \n",
    "all_vrnts_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_window_cntrl_misexp_genes.txt\")\n",
    "all_vrnts_bed_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/vrnt_id_in_windows_misexp_genes.bed\")\n",
    "misexp_vrnts_path = wkdir_path.joinpath(\"5_misexp_vrnts/test_cntrl_sets/vrnt_id_misexp_tpm_zscore_median.txt\")\n",
    "\n",
    "sv_info_path = \"/lustre/scratch126/humgen/projects/interval_rna/interval_rna_seq/thomasVDS/lof_missense/data/sv_vcf/info_table/final_sites_critical_info_allele.txt\"\n",
    "# score paths \n",
    "cadd_sv_score_info_path = wkdir_path.joinpath(\"5_misexp_vrnts/scores/cadd_sv/scores/intrvl_svs_no_inv_121042_cadd_sv_info.tsv\")\n",
    "phylop_scores_path = wkdir_path.joinpath(\"5_misexp_vrnts/scores/phylop/sv_in_windows_phylop_metrics.tsv\")\n",
    "gwrvis_dir = wkdir_path.joinpath(\"5_misexp_vrnts/scores/gwrvis\")\n",
    "gnomad_constraint_path = wkdir_path.joinpath(\"reference/gnomad/constraint_z_genome_1kb.qc.download.txt.clean\")\n",
    "hars_bed_path = wkdir_path.joinpath(\"reference/conservation/hars/GSE180714_HARs.bed\")\n",
    "\n",
    "# output directory\n",
    "out_dir = wkdir_path.joinpath(\"5_misexp_vrnts/scores/results\")\n",
    "out_dir.mkdir(parents=True, exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "56042387",
   "metadata": {},
   "outputs": [],
   "source": [
    "# constants \n",
    "CHROMOSOMES = ['chr1', 'chr2', 'chr3', 'chr4', 'chr5', 'chr6',\n",
    "               'chr7', 'chr8', 'chr9', 'chr10', 'chr11', 'chr12',\n",
    "               'chr13', 'chr14', 'chr15', 'chr16', 'chr17', 'chr18',\n",
    "               'chr19', 'chr20', 'chr21', 'chr22']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "467bf3d8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total number of SVs: 20255\n",
      "Number of misexpression-associated SVs: 105\n"
     ]
    }
   ],
   "source": [
    "# load bed file with variants in windows \n",
    "vrnts_in_windows_bed = BedTool(all_vrnts_bed_path)\n",
    "vrnts_in_windows_bed_sorted = vrnts_in_windows_bed.sort()\n",
    "# all variants in windows df\n",
    "all_vrnts_in_windows_df = pd.read_csv(all_vrnts_path, sep=\"\\t\", header=None).rename(columns={0:\"vrnt_id\"})\n",
    "print(f\"Total number of SVs: {len(all_vrnts_in_windows_df.vrnt_id.unique())}\")\n",
    "\n",
    "# load misexpression-associated variants \n",
    "misexp_vrnts_ids = pd.read_csv(misexp_vrnts_path, sep=\"\\t\", header=None)[0].astype(str).unique()\n",
    "print(f\"Number of misexpression-associated SVs: {len(misexp_vrnts_ids)}\")\n",
    "all_vrnts_in_windows_df[\"misexp_uniq\"] = np.where(all_vrnts_in_windows_df.vrnt_id.isin(misexp_vrnts_ids), 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fee77178-2fec-440b-a22d-be6c57aaabd7",
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "### CADD-SV\n",
    "vrnts_cadd_sv_scores_df = pd.read_csv(cadd_sv_score_info_path, sep=\"\\t\", dtype={\"vrnt_id\": str})\n",
    "# no inversions \n",
    "vrnts_cadd_sv_df = all_vrnts_in_windows_df.copy()\n",
    "vrnts_cadd_sv_df = pd.merge(vrnts_cadd_sv_df, \n",
    "                             vrnts_cadd_sv_scores_df[[\"vrnt_id\", \"Raw-Score-combined\"]], \n",
    "                             how=\"left\", \n",
    "                             on=\"vrnt_id\")\n",
    "vrnts_cadd_sv_df = vrnts_cadd_sv_df.rename(columns={\"Raw-Score-combined\": \"CADD_sv_raw_score\"})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "8af41365-fb7e-4bfc-b48c-50c953d450b1",
   "metadata": {},
   "outputs": [],
   "source": [
    "### Max PhyloP \n",
    "phylop_df = pd.read_csv(phylop_scores_path, sep=\"\\t\", dtype={\"vrnt_id\": str})\n",
    "vrnt_phylop_df = phylop_df[[\"vrnt_id\", \"phylop_max\"]].copy()\n",
    "vrnt_phylop_df[\"misexp_uniq\"] = np.where(vrnt_phylop_df.vrnt_id.isin(misexp_vrnts_ids), 1, 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "d3478309-f8e7-4e85-936f-766794fd169d",
   "metadata": {},
   "outputs": [],
   "source": [
    "### gnomAD z-score max constraint (max approach) \n",
    "\n",
    "# load GnomAD constraint scores \n",
    "gnomad_constraint = BedTool(gnomad_constraint_path)\n",
    "# intersect with bed file \n",
    "gnomad_bed_intersect_cols = {0:\"vrnt_chrom\", 1:\"vrnt_start\", 2:\"vrnt_end\", 3:\"vrnt_id\", 4:\"window_chrom\", \n",
    "                             5:\"window_start\", 6:\"window_end\", 7:\"window_id\", 8:\"pos\", 9:\"exp\", 10:\"obs\", 11:\"oe\", 12:\"zscore\", 13:\"overlap\"}\n",
    "sv_intersect_gnomad_constraint_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(gnomad_constraint, wo=True)))\n",
    "sv_intersect_gnomad_constraint_df = pd.read_csv(sv_intersect_gnomad_constraint_str, sep=\"\\t\", header=None).rename(columns=gnomad_bed_intersect_cols).astype({\"vrnt_id\":str})\n",
    "# max constraint score \n",
    "sv_intersect_max_constraint_df = pd.DataFrame(sv_intersect_gnomad_constraint_df.groupby(\"vrnt_id\")[\"zscore\"].max()).reset_index().rename(columns={\"zscore\":f\"gnomad_constraint_max_zscore\"})\n",
    "# around n=5041 do not have an gnomAD z-score annotation \n",
    "vrnt_gnomad_constraint_df = all_vrnts_in_windows_df.copy()\n",
    "vrnt_gnomad_constraint_df = pd.merge(vrnt_gnomad_constraint_df, \n",
    "                                     sv_intersect_max_constraint_df, \n",
    "                                     how=\"outer\", \n",
    "                                     on=\"vrnt_id\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d8fb7722",
   "metadata": {},
   "source": [
    "### gwRVIS "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "074f1f2c",
   "metadata": {},
   "outputs": [],
   "source": [
    "vrnt_gwrvis_df = all_vrnts_in_windows_df.copy()\n",
    "gwrvis_dir_path = Path(gwrvis_dir)\n",
    "gwrvis_score_metrics_list = []\n",
    "for chrom in CHROMOSOMES[:22]: \n",
    "    gwrvis_score_path = gwrvis_dir.joinpath(f\"{chrom}_sv_in_windows_gwrvis.tsv\")\n",
    "    gwrvis_score_metrics_list.append(pd.read_csv(gwrvis_score_path, sep=\"\\t\"))\n",
    "vrnt_gwrvis_scored_full_df = pd.concat(gwrvis_score_metrics_list)\n",
    "vrnt_gwrvis_scored_df = vrnt_gwrvis_scored_full_df[[\"vrnt_id\", \"min\"]].rename(columns={\"min\":\"gwrvis_min\"})\n",
    "vrnt_gwrvis_df = pd.merge(vrnt_gwrvis_df, \n",
    "                          vrnt_gwrvis_scored_df, \n",
    "                          on=\"vrnt_id\",                          \n",
    "                          how=\"left\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a68b19c4-f598-4fae-aee8-aaff8b9b2b6a",
   "metadata": {},
   "source": [
    "### Human accelerated regions (HARs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "bce5db97-9323-4f5b-bf59-8a984d7d39c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# load HARs \n",
    "hars_bed = BedTool(hars_bed_path).sort()\n",
    "bed_intersect_cols_hars = {0:\"vrnt_chrom\", 1:\"vrnt_start\", 2:\"vrnt_end\", 3:\"vrnt_id\", 4:\"har_chrom\", \n",
    "                           5:\"har_start\", 6:\"har_end\", 7:\"har_name\", 8:\"overlap\"}\n",
    "# identify SVs that overlap HARs\n",
    "sv_intersect_hars_str = StringIO(str(vrnts_in_windows_bed_sorted.intersect(hars_bed, wo=True)))\n",
    "sv_intersect_hars_df = pd.read_csv(sv_intersect_hars_str, sep=\"\\t\", header=None).rename(columns=bed_intersect_cols_hars).astype({\"vrnt_id\":str})\n",
    "sv_intersect_hars = sv_intersect_hars_df.vrnt_id.unique()\n",
    "# annotate variants intersecting at least one HAR\n",
    "vrnt_har_intersect_df = all_vrnts_in_windows_df.copy()\n",
    "vrnt_har_intersect_df[\"intersect_har\"] = np.where(vrnt_har_intersect_df.vrnt_id.isin(sv_intersect_hars), 1, 0)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4abbbbee-e890-40ca-b4ad-714422ebf2ea",
   "metadata": {},
   "source": [
    "### Combine Features "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "90e096a9-ee85-4de1-9c3c-247960299357",
   "metadata": {},
   "outputs": [],
   "source": [
    "# merge different features \n",
    "dfs_to_merge = [vrnt_gnomad_constraint_df, \n",
    "                vrnt_phylop_df, \n",
    "                vrnts_cadd_sv_df, \n",
    "                vrnt_har_intersect_df, \n",
    "                vrnt_gwrvis_df, \n",
    "               ]\n",
    "vrnt_features_merged_df = reduce(lambda  left,right: pd.merge(left,right, on=all_vrnts_in_windows_df.columns.tolist(),\n",
    "                                                              how='inner'), dfs_to_merge)\n",
    "# check all variants accounted for \n",
    "if set(vrnt_features_merged_df.vrnt_id.unique()) != set(all_vrnts_in_windows_df.vrnt_id.unique()): \n",
    "    raise ValueError(\"SVs with scores not equal to input set of SVs\")\n",
    "# add structural variant information \n",
    "sv_info_df =pd.read_csv(sv_info_path, sep=\"\\t\", dtype={\"plinkID\": str}).rename(columns={\"plinkID\":\"vrnt_id\"})\n",
    "\n",
    "vrnt_features_merged_info_df = pd.merge(vrnt_features_merged_df, \n",
    "                                       sv_info_df, \n",
    "                                       on=\"vrnt_id\", \n",
    "                                       how=\"left\"\n",
    "                                      )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "368164bc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# write to file \n",
    "features_dir = wkdir_path.joinpath(\"5_misexp_vrnts/scores/features\")\n",
    "features_dir.mkdir(parents=True, exist_ok=True)\n",
    "vrnt_features_out = features_dir.joinpath(\"vrnt_features_scores.csv\")\n",
    "vrnt_features_merged_info_df.to_csv(vrnt_features_out, index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b2ad042d-e9fd-40cc-8e21-968d03f01e3f",
   "metadata": {},
   "source": [
    "### Logistic regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "0ec4ee95-9dc9-415e-a974-8cd76a070028",
   "metadata": {
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DEL gnomad_constraint_max_zscore\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030210\n",
      "         Iterations 10\n",
      "DEL phylop_max\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.029496\n",
      "         Iterations 10\n",
      "DEL CADD_sv_raw_score\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030175\n",
      "         Iterations 10\n",
      "DEL intersect_har\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030441\n",
      "         Iterations 9\n",
      "DEL gwrvis_min\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030523\n",
      "         Iterations 9\n",
      "DUP gnomad_constraint_max_zscore\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.049581\n",
      "         Iterations 9\n",
      "DUP phylop_max\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.047669\n",
      "         Iterations 10\n",
      "DUP CADD_sv_raw_score\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.046789\n",
      "         Iterations 9\n",
      "DUP intersect_har\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.050099\n",
      "         Iterations 9\n",
      "DUP gwrvis_min\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.047910\n",
      "         Iterations 9\n"
     ]
    }
   ],
   "source": [
    "feature_list = [\"gnomad_constraint_max_zscore\", \n",
    "                \"phylop_max\", \n",
    "                \"CADD_sv_raw_score\", \n",
    "                \"intersect_har\",\n",
    "                \"gwrvis_min\"\n",
    "               ] \n",
    "\n",
    "sv_types_logistic_regr_results, logr_count = {}, 0\n",
    "for sv_type in [\"DEL\", \"DUP\"]:\n",
    "    for feature in feature_list:\n",
    "        print(sv_type, feature)\n",
    "        # remove NaNs before normalising feature and length \n",
    "        input_df = vrnt_features_merged_info_df[(vrnt_features_merged_info_df.SVTYPE == sv_type) & \n",
    "                                                (vrnt_features_merged_info_df[feature].notna())].copy()\n",
    "        input_df[\"svlen_norm\"] = (input_df[\"SVLEN\"] - input_df[\"SVLEN\"].mean())/input_df[\"SVLEN\"].std()\n",
    "        input_df[f\"{feature}_norm\"] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()\n",
    "        y, X = dmatrices(f'misexp_uniq ~ {feature}_norm + svlen_norm', input_df, return_type = 'dataframe')\n",
    "        logit_fit = discrete_model.Logit(endog=y, exog=X).fit()\n",
    "        log_odds, pval = logit_fit.params[1], logit_fit.pvalues[1]\n",
    "        # normal approximation confidence intervals\n",
    "        lower_conf = logit_fit.conf_int(alpha=0.05)[0][1]\n",
    "        upper_conf = logit_fit.conf_int(alpha=0.05)[1][1]\n",
    "        sv_types_logistic_regr_results[logr_count] = [feature, sv_type, log_odds, lower_conf, upper_conf, pval]\n",
    "        logr_count += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "b5d6bd59-821e-4949-91d2-1e39232876bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_logr_enrich = [\"feature\", \"sv_type\", \"log_odds\", \"lower\", \"upper\", \"pval\"]\n",
    "sv_types_logistic_regr_results_df = pd.DataFrame.from_dict(sv_types_logistic_regr_results, orient=\"index\", columns=columns_logr_enrich)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "762403cb-2653-4983-aea3-732a4edcc2fb",
   "metadata": {},
   "source": [
    "### Multiple Testing Correction "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f6c35499-9464-4a1c-bd63-77180d8ae4d7",
   "metadata": {},
   "outputs": [],
   "source": [
    "pval_list = sv_types_logistic_regr_results_df.pval.to_numpy()\n",
    "# multiple testing correction\n",
    "reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method=\"fdr_bh\")\n",
    "sv_types_logistic_regr_results_df[\"pass_fdr_bh\"] = reject\n",
    "sv_types_logistic_regr_results_df[\"pvals_corrected_fdr_bh\"] = pvals_corrected\n",
    "# Bonferroni correction \n",
    "reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method=\"bonferroni\")\n",
    "sv_types_logistic_regr_results_df[\"pass_bonf\"] = reject\n",
    "sv_types_logistic_regr_results_df[\"pvals_corrected_bonf\"] = pvals_corrected"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7be175d1",
   "metadata": {},
   "source": [
    "### Significant results "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "372c67ba",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DEL phylop_max 0.44666972739102306 1.679340729976719e-05\n",
      "DEL CADD_sv_raw_score 0.33270878595821457 0.009353413348014827\n",
      "DUP gnomad_constraint_max_zscore 0.9162232906963718 0.0023672342992138978\n",
      "DUP phylop_max 1.1047959551430508 0.017388866221444556\n",
      "DUP CADD_sv_raw_score 0.7566582574955489 0.0012445445904313093\n",
      "DUP gwrvis_min -0.7881166642483461 0.01871011782952506\n"
     ]
    }
   ],
   "source": [
    "# use Bonferroni to assign significance \n",
    "for index, row in sv_types_logistic_regr_results_df.iterrows(): \n",
    "    pass_bonf = row[\"pass_bonf\"]\n",
    "    if pass_bonf: \n",
    "        print(row[\"sv_type\"], row[\"feature\"], row[\"log_odds\"], row[\"pvals_corrected_bonf\"])"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6c8f28e1",
   "metadata": {},
   "source": [
    "### Write to file "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "17552bb2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# names for different scores \n",
    "features_to_keep = { 'CADD_sv_raw_score': \"CADD-SV\",\n",
    "                    'phylop_max': \"Conservation\",\n",
    "                    \"gnomad_constraint_max_zscore\": \"gnomAD constraint\",\n",
    "                    'gwrvis_min': \"gwRVIS\",\n",
    "                    'intersect_har': \"HARs\"\n",
    "                   }\n",
    "sv_types_logistic_regr_results_features_to_keep_df = sv_types_logistic_regr_results_df[sv_types_logistic_regr_results_df.feature.isin(features_to_keep.keys())].copy()\n",
    "sv_types_logistic_regr_results_features_to_keep_df[\"feature_name\"] = sv_types_logistic_regr_results_features_to_keep_df.feature.replace(features_to_keep)\n",
    "# write to file \n",
    "features_to_keep_path = out_dir.joinpath(\"misexp_sv_scores_enrich.tsv\")\n",
    "sv_types_logistic_regr_results_features_to_keep_df.to_csv(features_to_keep_path, sep=\"\\t\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0ad64f91",
   "metadata": {},
   "source": [
    "### Non length-adjusted enrichment "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "478e8db4",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DEL gnomad_constraint_max_zscore\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030370\n",
      "         Iterations 10\n",
      "DEL phylop_max\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.029527\n",
      "         Iterations 10\n",
      "DEL CADD_sv_raw_score\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030243\n",
      "         Iterations 10\n",
      "DEL intersect_har\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030529\n",
      "         Iterations 9\n",
      "DEL gwrvis_min\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.030652\n",
      "         Iterations 9\n",
      "DUP gnomad_constraint_max_zscore\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.050087\n",
      "         Iterations 9\n",
      "DUP phylop_max\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.048114\n",
      "         Iterations 10\n",
      "DUP CADD_sv_raw_score\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.048573\n",
      "         Iterations 9\n",
      "DUP intersect_har\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.053087\n",
      "         Iterations 9\n",
      "DUP gwrvis_min\n",
      "Optimization terminated successfully.\n",
      "         Current function value: 0.048408\n",
      "         Iterations 9\n"
     ]
    }
   ],
   "source": [
    "sv_types_logistic_regr_results_no_len_adj, logr_count = {}, 0\n",
    "for sv_type in [\"DEL\", \"DUP\"]:\n",
    "    for feature in feature_list:\n",
    "        print(sv_type, feature)\n",
    "        # remove NaNs before normalising feature\n",
    "        input_df = vrnt_features_merged_info_df[(vrnt_features_merged_info_df.SVTYPE == sv_type) & \n",
    "                                                (vrnt_features_merged_info_df[feature].notna())].copy()\n",
    "        input_df[f\"{feature}_norm\"] = (input_df[feature] - input_df[feature].mean())/input_df[feature].std()\n",
    "        y, X = dmatrices(f'misexp_uniq ~ {feature}_norm', input_df, return_type = 'dataframe')\n",
    "        logit_fit = discrete_model.Logit(endog=y, exog=X).fit()\n",
    "        log_odds, pval = logit_fit.params[1], logit_fit.pvalues[1]\n",
    "        # normal approximation confidence intervals\n",
    "        lower_conf = logit_fit.conf_int(alpha=0.05)[0][1]\n",
    "        upper_conf = logit_fit.conf_int(alpha=0.05)[1][1]\n",
    "        sv_types_logistic_regr_results_no_len_adj[logr_count] = [feature, sv_type, log_odds, lower_conf, upper_conf, pval]\n",
    "        logr_count += 1\n",
    "sv_types_logistic_regr_results_no_len_adj_df = pd.DataFrame.from_dict(sv_types_logistic_regr_results_no_len_adj, orient=\"index\", columns=columns_logr_enrich)\n",
    "# multiple testing correction\n",
    "pval_list = sv_types_logistic_regr_results_no_len_adj_df.pval.to_numpy()\n",
    "reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method=\"fdr_bh\")\n",
    "sv_types_logistic_regr_results_no_len_adj_df[\"pass_fdr_bh\"] = reject\n",
    "sv_types_logistic_regr_results_no_len_adj_df[\"pvals_corrected_fdr_bh\"] = pvals_corrected\n",
    "# Bonferroni correction \n",
    "reject, pvals_corrected, _, _ = multitest.multipletests(pval_list, alpha=0.05, method=\"bonferroni\")\n",
    "sv_types_logistic_regr_results_no_len_adj_df[\"pass_bonf\"] = reject\n",
    "sv_types_logistic_regr_results_no_len_adj_df[\"pvals_corrected_bonf\"] = pvals_corrected"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "d6ed73d9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DEL phylop_max 0.4761120056705365 6.708136427577256e-07\n",
      "DEL CADD_sv_raw_score 0.3769367363528046 0.0008904918450674706\n",
      "DUP gnomad_constraint_max_zscore 1.04330734362084 4.1522957075523265e-05\n",
      "DUP phylop_max 1.245842393354341 0.0018126293744282623\n",
      "DUP CADD_sv_raw_score 0.7856343905682212 0.0001574599372360728\n",
      "DUP gwrvis_min -0.925483321184434 0.00030305689651603573\n"
     ]
    }
   ],
   "source": [
    "# use Bonferroni to assign significance \n",
    "for index, row in sv_types_logistic_regr_results_no_len_adj_df.iterrows(): \n",
    "    pass_bonf = row[\"pass_bonf\"]\n",
    "    if pass_bonf: \n",
    "        print(row[\"sv_type\"], row[\"feature\"], row[\"log_odds\"], row[\"pvals_corrected_bonf\"])\n",
    "# add feature names         \n",
    "sv_types_logistic_regr_results_no_len_adj_features_to_keep_df = sv_types_logistic_regr_results_no_len_adj_df[sv_types_logistic_regr_results_no_len_adj_df.feature.isin(features_to_keep.keys())].copy()\n",
    "sv_types_logistic_regr_results_no_len_adj_features_to_keep_df[\"feature_name\"] = sv_types_logistic_regr_results_no_len_adj_df.feature.replace(features_to_keep)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "dc11d9df",
   "metadata": {},
   "outputs": [],
   "source": [
    "features_to_keep_no_len_adj_path = out_dir.joinpath(\"misexp_sv_scores_enrich_no_len_adj.tsv\")\n",
    "sv_types_logistic_regr_results_no_len_adj_features_to_keep_df.to_csv(features_to_keep_no_len_adj_path, sep=\"\\t\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
